{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import os\n",
    "import pickle\n",
    "from pprint import pprint\n",
    "import jieba\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import (GridSearchCV, cross_val_score, train_test_split)\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.semi_supervised import LabelSpreading\n",
    "from scipy import stats\n",
    "from model.db import DB_ENGINE, rawcontents\n",
    "from sqlalchemy import update\n",
    "from utils.log import getLogger\n",
    "from sklearn.linear_model import SGDClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "logger = getLogger('semiTrain')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fetchAllData():\n",
    "    return pd.read_sql(\n",
    "        'SELECT rid, tag, assure FROM rawcontents',\n",
    "        DB_ENGINE\n",
    "    )\n",
    "    return raw_contents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "def selectTrainData(data, unlabeled):\n",
    "    unlabeled_data = data[data['assure'] < 0.5].copy()\n",
    "    labeled_data = data[data['assure'] > 0.5].copy()\n",
    "\n",
    "    inds = np.arange(len(unlabeled_data))\n",
    "    np.random.shuffle(inds)\n",
    "\n",
    "    return labeled_data.append(unlabeled_data.iloc[inds[:unlabeled]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "def completeTrainData(current):\n",
    "    stmt = 'SELECT rid, content, vector FROM rawcontents WHERE rid IN {}'\n",
    "\n",
    "    with DB_ENGINE.connect() as conn:\n",
    "        traindata = current.merge(\n",
    "            pd.DataFrame(\n",
    "                conn.execute(\n",
    "                    stmt.format(tuple(current['rid'].values))\n",
    "                ).fetchall(),\n",
    "                columns=['rid', 'content', 'vector']\n",
    "            ),\n",
    "            on='rid'\n",
    "        )\n",
    "\n",
    "    traindata['ss'] = list(map(\n",
    "        lambda x: -1 if x[0] < 0.5 else x[1],\n",
    "        zip(traindata['assure'], traindata['tag'])\n",
    "    ))\n",
    "    traindata['vector'] = traindata['vector'].apply(pickle.loads)\n",
    "\n",
    "    return traindata.set_index(np.arange(len(current)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extractData(df):\n",
    "    return list(df['vector'].values), list(df['tag'].values) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "logger.info('Read Database ...')\n",
    "raw_contents = fetchAllData()\n",
    "logger.info('Complete.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "logger.info('Read train data ...')\n",
    "current = selectTrainData(raw_contents, 2048)\n",
    "current = completeTrainData(current)\n",
    "logger.info('Complete')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "labeled = current[current['assure'] > 0.5].copy()\n",
    "unlabeled = current[current['assure'] < 0.5].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    labeled['vector'].values, \n",
    "    labeled['tag'].values, \n",
    "    random_state = ord(os.urandom(1))\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf = SGDClassifier(\n",
    "    random_state=ord(os.urandom(1)), \n",
    "    max_iter=512, \n",
    "    tol=1e-3,\n",
    "    penalty='elasticnet',\n",
    "    loss='modified_huber',\n",
    "    fit_intercept=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.98436647, 0.98739392, 0.98803911, 0.98798888, 0.97895468])"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cross_val_score(\n",
    "    clf,\n",
    "    list(labeled['vector'].values), \n",
    "    list(labeled['tag'].values), \n",
    "    cv=5\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_labeled, y_labeled = extractData(labeled)\n",
    "X_unlabeled, y_unlabeled = extractData(unlabeled)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n",
       "       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=False,\n",
       "       l1_ratio=0.15, learning_rate='optimal', loss='modified_huber',\n",
       "       max_iter=512, n_iter=None, n_iter_no_change=5, n_jobs=None,\n",
       "       penalty='elasticnet', power_t=0.5, random_state=229, shuffle=True,\n",
       "       tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf_fit_result = clf.fit(X_labeled, y_labeled)\n",
    "clf_fit_result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_predict = clf_fit_result.predict(X_unlabeled)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rid</th>\n",
       "      <th>content_x</th>\n",
       "      <th>tag</th>\n",
       "      <th>assure</th>\n",
       "      <th>content_y</th>\n",
       "      <th>vector</th>\n",
       "      <th>ss</th>\n",
       "      <th>predict</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>100742</th>\n",
       "      <td>59691</td>\n",
       "      <td>支持楼主，楼主分析的不错</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>支持楼主，楼主分析的不错</td>\n",
       "      <td>[-0.00050121656, 0.004646495, 0.00049711263, -...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100743</th>\n",
       "      <td>1074015</td>\n",
       "      <td>有懂行的朋友吗，浙江这边现浇楼梯多少钱，现浇楼面多少钱一平方？</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>有懂行的朋友吗，浙江这边现浇楼梯多少钱，现浇楼面多少钱一平方？</td>\n",
       "      <td>[-0.00021289136, -0.0002112096, 0.00020131774,...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100744</th>\n",
       "      <td>605732</td>\n",
       "      <td>假如你是当官的，下场会比秦桧惨</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>假如你是当官的，下场会比秦桧惨</td>\n",
       "      <td>[-0.00041641563, 0.004448106, 0.0007133413, -0...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100745</th>\n",
       "      <td>698948</td>\n",
       "      <td>呵呵呵 我三套房产不包括现在父母这套住房 我怎么了 我立马在上上的星期把我其中一套住房（比较...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>呵呵呵 我三套房产不包括现在父母这套住房 我怎么了 我立马在上上的星期把我其中一套住房（比较...</td>\n",
       "      <td>[-0.0001965089, 0.0002559799, -0.00016521648, ...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100746</th>\n",
       "      <td>732077</td>\n",
       "      <td>结伴走天涯，点赞去。\\n   \\n\\n    抢红包</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>结伴走天涯，点赞去。\\n   \\n\\n    抢红包</td>\n",
       "      <td>[-0.00012122595, -0.00019975816, -9.6026626e-0...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100747</th>\n",
       "      <td>604329</td>\n",
       "      <td>@匆匆那年2018ABC    2018-08-19 11:02:37\\n \\n 接近税务深...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>@匆匆那年2018ABC    2018-08-19 11:02:37\\n \\n 接近税务深...</td>\n",
       "      <td>[-0.00015112055, -0.00021713384, 4.3946588e-05...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100748</th>\n",
       "      <td>790591</td>\n",
       "      <td>我就在想，要是我当时仔细研究一下她的那些衣服的话，我可以少走好多弯路。\\n \\n\\n 在春天...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>我就在想，要是我当时仔细研究一下她的那些衣服的话，我可以少走好多弯路。\\n \\n\\n 在春天...</td>\n",
       "      <td>[0.00014590095, -2.36256e-05, -5.815563e-06, 7...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100749</th>\n",
       "      <td>1323314</td>\n",
       "      <td>靠一个洗剪吹的tony老师来吹风放消息 ，离死不远</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>靠一个洗剪吹的tony老师来吹风放消息 ，离死不远</td>\n",
       "      <td>[2.177861e-06, 0.00014500064, -2.8838269e-05, ...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100750</th>\n",
       "      <td>95676</td>\n",
       "      <td>诺贝尔崔崔\\n  \\n  抢到了\\n  \\n   元素yz\\n  \\n  的红包，价值0.0...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>诺贝尔崔崔\\n  \\n  抢到了\\n  \\n   元素yz\\n  \\n  的红包，价值0.0...</td>\n",
       "      <td>[-0.00053806993, 0.004573738, 0.00034990904, -...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100751</th>\n",
       "      <td>176206</td>\n",
       "      <td>EOS作为一个去中心化的操作系统，支持很多开发者在其上面开发Dapp，EOS token的作...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>EOS作为一个去中心化的操作系统，支持很多开发者在其上面开发Dapp，EOS token的作...</td>\n",
       "      <td>[5.1506522e-05, -5.8857413e-06, 1.2539855e-05,...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100752</th>\n",
       "      <td>952575</td>\n",
       "      <td>光天之下竞然还有如此冤案。期待党和政府还他们公正…</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>光天之下竞然还有如此冤案。期待党和政府还他们公正…</td>\n",
       "      <td>[5.0086005e-06, -5.9196904e-05, 0.00014237005,...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100753</th>\n",
       "      <td>1572078</td>\n",
       "      <td>第五章  万物平等，因此人当建设平等社会\\n \\n\\n 天地不仁，以万物为刍狗；圣人不仁，以...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>第五章  万物平等，因此人当建设平等社会\\n \\n\\n 天地不仁，以万物为刍狗；圣人不仁，以...</td>\n",
       "      <td>[0.00023834412, 0.00014466436, 9.70999e-06, -9...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100754</th>\n",
       "      <td>808153</td>\n",
       "      <td>@半岛之南2017    2018-10-11 15:08:03\\n \\n 假如真是这样，这...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>@半岛之南2017    2018-10-11 15:08:03\\n \\n 假如真是这样，这...</td>\n",
       "      <td>[-0.00022901272, 0.00014568881, 5.5341217e-05,...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100755</th>\n",
       "      <td>447225</td>\n",
       "      <td>今日天涯分价格：0.07元；今日点赞分是：20.96；持钻收益比为：9.26%\\n \\n 今...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>今日天涯分价格：0.07元；今日点赞分是：20.96；持钻收益比为：9.26%\\n \\n 今...</td>\n",
       "      <td>[-0.00020383063, 0.00023106053, -9.111098e-05,...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100756</th>\n",
       "      <td>438156</td>\n",
       "      <td>@老糊涂_001 2018-09-13 11:52:58\\n \\n 赞一个！\\n \\n --...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>@老糊涂_001 2018-09-13 11:52:58\\n \\n 赞一个！\\n \\n --...</td>\n",
       "      <td>[9.605871e-05, -0.00023950999, 5.6189332e-05, ...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100757</th>\n",
       "      <td>475002</td>\n",
       "      <td>主贴已赞的请回</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>主贴已赞的请回</td>\n",
       "      <td>[-0.0009158363, 0.0066905254, 0.0005661698, -0...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100758</th>\n",
       "      <td>1397042</td>\n",
       "      <td>这个，我不同意楼主。\\n \\n 专利保护期过来为什么不能仿制？</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>这个，我不同意楼主。\\n \\n 专利保护期过来为什么不能仿制？</td>\n",
       "      <td>[-0.00015296177, -0.00022891088, -0.0001389691...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100759</th>\n",
       "      <td>610691</td>\n",
       "      <td>道德沦丧不是一个人造成的，也不是一小部分人造成的，而是多数人共同作用的结果。</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>道德沦丧不是一个人造成的，也不是一小部分人造成的，而是多数人共同作用的结果。</td>\n",
       "      <td>[7.515958e-05, 9.5837815e-05, -0.00018056587, ...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100760</th>\n",
       "      <td>1128018</td>\n",
       "      <td>马克</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>马克</td>\n",
       "      <td>[0.00017363396, 0.00016908465, 0.00012319739, ...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100761</th>\n",
       "      <td>498641</td>\n",
       "      <td>因为位置的价格不一样,谁出的钱多谁就占好位置,还有活动专区</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>因为位置的价格不一样,谁出的钱多谁就占好位置,还有活动专区</td>\n",
       "      <td>[-0.00016200649, 8.652916e-05, 6.718655e-05, 8...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100762</th>\n",
       "      <td>1183017</td>\n",
       "      <td>我宝贝动动来了吗</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>我宝贝动动来了吗</td>\n",
       "      <td>[-0.00049750454, 0.0019457284, 0.00022722762, ...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100763</th>\n",
       "      <td>980043</td>\n",
       "      <td>楼主宽心，会遇见更好的，你应该高兴</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>楼主宽心，会遇见更好的，你应该高兴</td>\n",
       "      <td>[0.00020053485, -0.0002160742, 0.00017019118, ...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100764</th>\n",
       "      <td>1241222</td>\n",
       "      <td>Rosie和郭达森先生好甜~~~   儿子也好可爱</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>Rosie和郭达森先生好甜~~~   儿子也好可爱</td>\n",
       "      <td>[-0.00017903057, 0.0001622189, -0.00022155614,...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100765</th>\n",
       "      <td>879838</td>\n",
       "      <td>吃完晚饭，一个人穿过厦大，\\n \\n 来了厦大这么久都没有去过后门，\\n \\n 现在走过去看...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>吃完晚饭，一个人穿过厦大，\\n \\n 来了厦大这么久都没有去过后门，\\n \\n 现在走过去看...</td>\n",
       "      <td>[-0.00011312829, -0.00023002345, -0.0001825704...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100766</th>\n",
       "      <td>805046</td>\n",
       "      <td>某报明天会说：生存理念升级了。大家不要大小怪。</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>某报明天会说：生存理念升级了。大家不要大小怪。</td>\n",
       "      <td>[-1.365614e-05, 2.8391225e-05, -0.00021587127,...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100767</th>\n",
       "      <td>1217917</td>\n",
       "      <td>baby在跑男前不是真正的明星？你是认真的吗？baby参加跑男前在香港多火你知道吗？</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>baby在跑男前不是真正的明星？你是认真的吗？baby参加跑男前在香港多火你知道吗？</td>\n",
       "      <td>[-0.0001810392, 0.00022770114, 0.000116205876,...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100768</th>\n",
       "      <td>1122087</td>\n",
       "      <td>所以啊，再次呼吁我们国家电视剧和电影能\\n \\n\\n 赶\\n \\n\\n 紧\\n \\n\\n 分...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>所以啊，再次呼吁我们国家电视剧和电影能\\n \\n\\n 赶\\n \\n\\n 紧\\n \\n\\n 分...</td>\n",
       "      <td>[3.038715e-05, 3.3963806e-05, -1.3680326e-05, ...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100769</th>\n",
       "      <td>396070</td>\n",
       "      <td>比特币洗盘之后缓缓上涨，莱特币却一直横盘，走势显得比较疲软，</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>比特币洗盘之后缓缓上涨，莱特币却一直横盘，走势显得比较疲软，</td>\n",
       "      <td>[3.211969e-05, -0.00015681396, 1.900563e-05, 0...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100770</th>\n",
       "      <td>159172</td>\n",
       "      <td>LUO  ER, 谁 第 249+1   楼呀  ？  嘿嘿嘿</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>LUO  ER, 谁 第 249+1   楼呀  ？  嘿嘿嘿</td>\n",
       "      <td>[-0.00017184012, 1.6387998e-05, -3.682658e-05,...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100771</th>\n",
       "      <td>190006</td>\n",
       "      <td>我参与了投票:内心平静,你也来表个态吧~</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>我参与了投票:内心平静,你也来表个态吧~</td>\n",
       "      <td>[5.7232e-05, -4.43334e-06, -0.00019098798, 3.0...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102760</th>\n",
       "      <td>380996</td>\n",
       "      <td>楼主好人</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>楼主好人</td>\n",
       "      <td>[1.2418393e-05, -9.22599e-05, -0.00022082537, ...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102761</th>\n",
       "      <td>309169</td>\n",
       "      <td>大家一起把涯钻的事业做起来。有能量的点赞，无能量的盖楼，钻多的发红包，钻少的发贴子，有区块链...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>大家一起把涯钻的事业做起来。有能量的点赞，无能量的盖楼，钻多的发红包，钻少的发贴子，有区块链...</td>\n",
       "      <td>[0.00020369448, 0.00022521541, 0.00019723042, ...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102762</th>\n",
       "      <td>904441</td>\n",
       "      <td>继续发</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>继续发</td>\n",
       "      <td>[9.0072135e-05, 8.114723e-05, 6.366512e-05, -8...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102763</th>\n",
       "      <td>274543</td>\n",
       "      <td>据说排名第一的钱包地址是天涯的，有几千万个</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>据说排名第一的钱包地址是天涯的，有几千万个</td>\n",
       "      <td>[-0.0004768259, 0.0049025207, 0.0006336634, -0...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102764</th>\n",
       "      <td>1352259</td>\n",
       "      <td>哪个男的是谁</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>哪个男的是谁</td>\n",
       "      <td>[0.00011036501, -8.9452784e-05, 4.2348955e-05,...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102765</th>\n",
       "      <td>690753</td>\n",
       "      <td>就像马云说的假货问题是整个社会的问题，不能把问题推给企业了事。</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>就像马云说的假货问题是整个社会的问题，不能把问题推给企业了事。</td>\n",
       "      <td>[-0.0008231468, 0.0048623886, 0.0005516676, -0...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102766</th>\n",
       "      <td>467114</td>\n",
       "      <td>借助区块链3.0技术，实物资产将大规模上链，映射为可确权的“数字票证”，这将带来数字经济的爆...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>借助区块链3.0技术，实物资产将大规模上链，映射为可确权的“数字票证”，这将带来数字经济的爆...</td>\n",
       "      <td>[-6.13557e-05, -0.00015381981, 0.0001829403, -...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102767</th>\n",
       "      <td>580173</td>\n",
       "      <td>打好防守攻坚战</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>打好防守攻坚战</td>\n",
       "      <td>[-6.63767e-05, -0.0001055677, 0.00015874115, -...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102768</th>\n",
       "      <td>1127844</td>\n",
       "      <td>这个女人是谁</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>这个女人是谁</td>\n",
       "      <td>[-4.789428e-05, -0.00020672627, -0.00013019978...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102769</th>\n",
       "      <td>744679</td>\n",
       "      <td>顶起来，为正义喝彩！</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>顶起来，为正义喝彩！</td>\n",
       "      <td>[-0.00084950624, 0.0048679304, 0.0006878436, -...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102770</th>\n",
       "      <td>1491497</td>\n",
       "      <td>黑猫白猫抖一抖，公蜘母蛛满地走。\\n \\n 蛀虫多在体制内，尤其宣传教育口。</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>黑猫白猫抖一抖，公蜘母蛛满地走。\\n \\n 蛀虫多在体制内，尤其宣传教育口。</td>\n",
       "      <td>[-4.11761e-05, -3.5230845e-05, 0.00012575409, ...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102771</th>\n",
       "      <td>926404</td>\n",
       "      <td>公元：2018年12月6日11时13分25秒 阴8局\\n \\n 农历：2018年10月29日...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>公元：2018年12月6日11时13分25秒 阴8局\\n \\n 农历：2018年10月29日...</td>\n",
       "      <td>[0.00012963584, 0.00012855428, -0.00014557735,...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102772</th>\n",
       "      <td>357229</td>\n",
       "      <td>你都没钻了，怎么点赞啊</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>你都没钻了，怎么点赞啊</td>\n",
       "      <td>[-0.0001524354, 6.217486e-05, 0.00019323081, 0...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102773</th>\n",
       "      <td>1136609</td>\n",
       "      <td>但恩桐坚持，云告天只得加了一条附加条款。</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>但恩桐坚持，云告天只得加了一条附加条款。</td>\n",
       "      <td>[0.00020328503, 0.00039947787, 3.8845763e-05, ...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102774</th>\n",
       "      <td>344996</td>\n",
       "      <td>给你点赞了，以能量换能量不占便宜不吃亏</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>给你点赞了，以能量换能量不占便宜不吃亏</td>\n",
       "      <td>[-0.00017894346, -0.0002214486, 0.00011141604,...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102775</th>\n",
       "      <td>608485</td>\n",
       "      <td>不管你说什么，就是不买</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>不管你说什么，就是不买</td>\n",
       "      <td>[-0.0005581288, 0.004712583, 0.0004242073, -0....</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102776</th>\n",
       "      <td>1553392</td>\n",
       "      <td>强盗来了</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>强盗来了</td>\n",
       "      <td>[0.0001694109, -0.00023546406, -3.9594e-05, -0...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102777</th>\n",
       "      <td>309368</td>\n",
       "      <td>大家一起把涯钻的事业做起来。有能量的点赞，无能量的盖楼，钻多的发红包，钻少的发贴子，有区块链...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>大家一起把涯钻的事业做起来。有能量的点赞，无能量的盖楼，钻多的发红包，钻少的发贴子，有区块链...</td>\n",
       "      <td>[0.00020369448, 0.00022521541, 0.00019723042, ...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102778</th>\n",
       "      <td>396154</td>\n",
       "      <td>@灿若星辰9 2018-09-12 08:45:28\\n \\n 全天在线，赞我秒回红包\\n ...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>@灿若星辰9 2018-09-12 08:45:28\\n \\n 全天在线，赞我秒回红包\\n ...</td>\n",
       "      <td>[0.00017237266, -0.00012719107, 8.0563535e-05,...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102779</th>\n",
       "      <td>273838</td>\n",
       "      <td>区块链这个新型的技术是不会被杀死的，任何一个新的东西出来都会经历市场的历练，就跟当时的互联网...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>区块链这个新型的技术是不会被杀死的，任何一个新的东西出来都会经历市场的历练，就跟当时的互联网...</td>\n",
       "      <td>[-1.5643005e-05, -0.0001529364, -0.00017457732...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102780</th>\n",
       "      <td>933624</td>\n",
       "      <td>有饛簋飧,有捄棘匕。周道如砥,其直如矢。君子所履,小人所视。眷言顾之,潸焉出涕。</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>有饛簋飧,有捄棘匕。周道如砥,其直如矢。君子所履,小人所视。眷言顾之,潸焉出涕。</td>\n",
       "      <td>[-0.00015580615, 0.00013681517, 0.00010879679,...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102781</th>\n",
       "      <td>1098546</td>\n",
       "      <td>所以女人的终极目标是要嫁人生子？</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>所以女人的终极目标是要嫁人生子？</td>\n",
       "      <td>[-0.00018765307, -3.1531177e-05, 0.00013083006...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102782</th>\n",
       "      <td>575655</td>\n",
       "      <td>最近比较忙，上贴吧少，可以联系老师扣扣或威信2563454870</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>最近比较忙，上贴吧少，可以联系老师扣扣或威信2563454870</td>\n",
       "      <td>[1.0329338e-05, -5.8130994e-05, 4.8842303e-05,...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102783</th>\n",
       "      <td>851508</td>\n",
       "      <td>鲁创已回红包</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>鲁创已回红包</td>\n",
       "      <td>[-8.7861634e-05, 0.00013935432, -2.9601068e-05...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102784</th>\n",
       "      <td>729111</td>\n",
       "      <td>只有标题，本文没几个字！鉴定完毕水贴一张</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>只有标题，本文没几个字！鉴定完毕水贴一张</td>\n",
       "      <td>[-0.00045963874, 0.004632451, 0.0003776469, -0...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102785</th>\n",
       "      <td>1284232</td>\n",
       "      <td>主要是人设是美男子，他颜值不够，演的又太过用力，本身角色就挺惹人讨厌，无法让人心生怜惜</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>主要是人设是美男子，他颜值不够，演的又太过用力，本身角色就挺惹人讨厌，无法让人心生怜惜</td>\n",
       "      <td>[-0.00015876505, -3.1368076e-05, -0.0001053466...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102786</th>\n",
       "      <td>579520</td>\n",
       "      <td>你的命都可以被管控，何况是房产</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>你的命都可以被管控，何况是房产</td>\n",
       "      <td>[-0.00041723697, 0.004586449, 0.00037609783, -...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102787</th>\n",
       "      <td>1203790</td>\n",
       "      <td>明星演的看着假像看过了的，不精彩了。</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>明星演的看着假像看过了的，不精彩了。</td>\n",
       "      <td>[-0.0004734579, 0.0047924956, 0.00065743795, -...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102788</th>\n",
       "      <td>181214</td>\n",
       "      <td>顶贴</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>顶贴</td>\n",
       "      <td>[-0.00015331614, -1.1328045e-05, 0.00021822979...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102789</th>\n",
       "      <td>965661</td>\n",
       "      <td>谁能人肉一下</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>谁能人肉一下</td>\n",
       "      <td>[-6.006425e-05, 7.563771e-06, -2.4452002e-05, ...</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2048 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            rid                                          content_x  tag  \\\n",
       "100742    59691                                       支持楼主，楼主分析的不错  NaN   \n",
       "100743  1074015                    有懂行的朋友吗，浙江这边现浇楼梯多少钱，现浇楼面多少钱一平方？  NaN   \n",
       "100744   605732                                    假如你是当官的，下场会比秦桧惨  NaN   \n",
       "100745   698948  呵呵呵 我三套房产不包括现在父母这套住房 我怎么了 我立马在上上的星期把我其中一套住房（比较...  NaN   \n",
       "100746   732077                         结伴走天涯，点赞去。\\n   \\n\\n    抢红包  NaN   \n",
       "100747   604329  @匆匆那年2018ABC    2018-08-19 11:02:37\\n \\n 接近税务深...  NaN   \n",
       "100748   790591  我就在想，要是我当时仔细研究一下她的那些衣服的话，我可以少走好多弯路。\\n \\n\\n 在春天...  NaN   \n",
       "100749  1323314                          靠一个洗剪吹的tony老师来吹风放消息 ，离死不远  NaN   \n",
       "100750    95676  诺贝尔崔崔\\n  \\n  抢到了\\n  \\n   元素yz\\n  \\n  的红包，价值0.0...  NaN   \n",
       "100751   176206  EOS作为一个去中心化的操作系统，支持很多开发者在其上面开发Dapp，EOS token的作...  NaN   \n",
       "100752   952575                          光天之下竞然还有如此冤案。期待党和政府还他们公正…  NaN   \n",
       "100753  1572078  第五章  万物平等，因此人当建设平等社会\\n \\n\\n 天地不仁，以万物为刍狗；圣人不仁，以...  NaN   \n",
       "100754   808153  @半岛之南2017    2018-10-11 15:08:03\\n \\n 假如真是这样，这...  NaN   \n",
       "100755   447225  今日天涯分价格：0.07元；今日点赞分是：20.96；持钻收益比为：9.26%\\n \\n 今...  NaN   \n",
       "100756   438156  @老糊涂_001 2018-09-13 11:52:58\\n \\n 赞一个！\\n \\n --...  NaN   \n",
       "100757   475002                                            主贴已赞的请回  NaN   \n",
       "100758  1397042                    这个，我不同意楼主。\\n \\n 专利保护期过来为什么不能仿制？  NaN   \n",
       "100759   610691             道德沦丧不是一个人造成的，也不是一小部分人造成的，而是多数人共同作用的结果。  NaN   \n",
       "100760  1128018                                                 马克  NaN   \n",
       "100761   498641                      因为位置的价格不一样,谁出的钱多谁就占好位置,还有活动专区  NaN   \n",
       "100762  1183017                                           我宝贝动动来了吗  NaN   \n",
       "100763   980043                                  楼主宽心，会遇见更好的，你应该高兴  NaN   \n",
       "100764  1241222                          Rosie和郭达森先生好甜~~~   儿子也好可爱  NaN   \n",
       "100765   879838  吃完晚饭，一个人穿过厦大，\\n \\n 来了厦大这么久都没有去过后门，\\n \\n 现在走过去看...  NaN   \n",
       "100766   805046                            某报明天会说：生存理念升级了。大家不要大小怪。  NaN   \n",
       "100767  1217917         baby在跑男前不是真正的明星？你是认真的吗？baby参加跑男前在香港多火你知道吗？  NaN   \n",
       "100768  1122087  所以啊，再次呼吁我们国家电视剧和电影能\\n \\n\\n 赶\\n \\n\\n 紧\\n \\n\\n 分...  NaN   \n",
       "100769   396070                     比特币洗盘之后缓缓上涨，莱特币却一直横盘，走势显得比较疲软，  NaN   \n",
       "100770   159172                    LUO  ER, 谁 第 249+1   楼呀  ？  嘿嘿嘿  NaN   \n",
       "100771   190006                               我参与了投票:内心平静,你也来表个态吧~  NaN   \n",
       "...         ...                                                ...  ...   \n",
       "102760   380996                                               楼主好人  NaN   \n",
       "102761   309169  大家一起把涯钻的事业做起来。有能量的点赞，无能量的盖楼，钻多的发红包，钻少的发贴子，有区块链...  NaN   \n",
       "102762   904441                                                继续发  NaN   \n",
       "102763   274543                              据说排名第一的钱包地址是天涯的，有几千万个  NaN   \n",
       "102764  1352259                                             哪个男的是谁  NaN   \n",
       "102765   690753                    就像马云说的假货问题是整个社会的问题，不能把问题推给企业了事。  NaN   \n",
       "102766   467114  借助区块链3.0技术，实物资产将大规模上链，映射为可确权的“数字票证”，这将带来数字经济的爆...  NaN   \n",
       "102767   580173                                            打好防守攻坚战  NaN   \n",
       "102768  1127844                                             这个女人是谁  NaN   \n",
       "102769   744679                                         顶起来，为正义喝彩！  NaN   \n",
       "102770  1491497             黑猫白猫抖一抖，公蜘母蛛满地走。\\n \\n 蛀虫多在体制内，尤其宣传教育口。  NaN   \n",
       "102771   926404  公元：2018年12月6日11时13分25秒 阴8局\\n \\n 农历：2018年10月29日...  NaN   \n",
       "102772   357229                                        你都没钻了，怎么点赞啊  NaN   \n",
       "102773  1136609                               但恩桐坚持，云告天只得加了一条附加条款。  NaN   \n",
       "102774   344996                                给你点赞了，以能量换能量不占便宜不吃亏  NaN   \n",
       "102775   608485                                        不管你说什么，就是不买  NaN   \n",
       "102776  1553392                                               强盗来了  NaN   \n",
       "102777   309368  大家一起把涯钻的事业做起来。有能量的点赞，无能量的盖楼，钻多的发红包，钻少的发贴子，有区块链...  NaN   \n",
       "102778   396154  @灿若星辰9 2018-09-12 08:45:28\\n \\n 全天在线，赞我秒回红包\\n ...  NaN   \n",
       "102779   273838  区块链这个新型的技术是不会被杀死的，任何一个新的东西出来都会经历市场的历练，就跟当时的互联网...  NaN   \n",
       "102780   933624           有饛簋飧,有捄棘匕。周道如砥,其直如矢。君子所履,小人所视。眷言顾之,潸焉出涕。  NaN   \n",
       "102781  1098546                                   所以女人的终极目标是要嫁人生子？  NaN   \n",
       "102782   575655                   最近比较忙，上贴吧少，可以联系老师扣扣或威信2563454870  NaN   \n",
       "102783   851508                                             鲁创已回红包  NaN   \n",
       "102784   729111                               只有标题，本文没几个字！鉴定完毕水贴一张  NaN   \n",
       "102785  1284232        主要是人设是美男子，他颜值不够，演的又太过用力，本身角色就挺惹人讨厌，无法让人心生怜惜  NaN   \n",
       "102786   579520                                    你的命都可以被管控，何况是房产  NaN   \n",
       "102787  1203790                                 明星演的看着假像看过了的，不精彩了。  NaN   \n",
       "102788   181214                                                 顶贴  NaN   \n",
       "102789   965661                                             谁能人肉一下  NaN   \n",
       "\n",
       "        assure                                          content_y  \\\n",
       "100742       0                                       支持楼主，楼主分析的不错   \n",
       "100743       0                    有懂行的朋友吗，浙江这边现浇楼梯多少钱，现浇楼面多少钱一平方？   \n",
       "100744       0                                    假如你是当官的，下场会比秦桧惨   \n",
       "100745       0  呵呵呵 我三套房产不包括现在父母这套住房 我怎么了 我立马在上上的星期把我其中一套住房（比较...   \n",
       "100746       0                         结伴走天涯，点赞去。\\n   \\n\\n    抢红包   \n",
       "100747       0  @匆匆那年2018ABC    2018-08-19 11:02:37\\n \\n 接近税务深...   \n",
       "100748       0  我就在想，要是我当时仔细研究一下她的那些衣服的话，我可以少走好多弯路。\\n \\n\\n 在春天...   \n",
       "100749       0                          靠一个洗剪吹的tony老师来吹风放消息 ，离死不远   \n",
       "100750       0  诺贝尔崔崔\\n  \\n  抢到了\\n  \\n   元素yz\\n  \\n  的红包，价值0.0...   \n",
       "100751       0  EOS作为一个去中心化的操作系统，支持很多开发者在其上面开发Dapp，EOS token的作...   \n",
       "100752       0                          光天之下竞然还有如此冤案。期待党和政府还他们公正…   \n",
       "100753       0  第五章  万物平等，因此人当建设平等社会\\n \\n\\n 天地不仁，以万物为刍狗；圣人不仁，以...   \n",
       "100754       0  @半岛之南2017    2018-10-11 15:08:03\\n \\n 假如真是这样，这...   \n",
       "100755       0  今日天涯分价格：0.07元；今日点赞分是：20.96；持钻收益比为：9.26%\\n \\n 今...   \n",
       "100756       0  @老糊涂_001 2018-09-13 11:52:58\\n \\n 赞一个！\\n \\n --...   \n",
       "100757       0                                            主贴已赞的请回   \n",
       "100758       0                    这个，我不同意楼主。\\n \\n 专利保护期过来为什么不能仿制？   \n",
       "100759       0             道德沦丧不是一个人造成的，也不是一小部分人造成的，而是多数人共同作用的结果。   \n",
       "100760       0                                                 马克   \n",
       "100761       0                      因为位置的价格不一样,谁出的钱多谁就占好位置,还有活动专区   \n",
       "100762       0                                           我宝贝动动来了吗   \n",
       "100763       0                                  楼主宽心，会遇见更好的，你应该高兴   \n",
       "100764       0                          Rosie和郭达森先生好甜~~~   儿子也好可爱   \n",
       "100765       0  吃完晚饭，一个人穿过厦大，\\n \\n 来了厦大这么久都没有去过后门，\\n \\n 现在走过去看...   \n",
       "100766       0                            某报明天会说：生存理念升级了。大家不要大小怪。   \n",
       "100767       0         baby在跑男前不是真正的明星？你是认真的吗？baby参加跑男前在香港多火你知道吗？   \n",
       "100768       0  所以啊，再次呼吁我们国家电视剧和电影能\\n \\n\\n 赶\\n \\n\\n 紧\\n \\n\\n 分...   \n",
       "100769       0                     比特币洗盘之后缓缓上涨，莱特币却一直横盘，走势显得比较疲软，   \n",
       "100770       0                    LUO  ER, 谁 第 249+1   楼呀  ？  嘿嘿嘿   \n",
       "100771       0                               我参与了投票:内心平静,你也来表个态吧~   \n",
       "...        ...                                                ...   \n",
       "102760       0                                               楼主好人   \n",
       "102761       0  大家一起把涯钻的事业做起来。有能量的点赞，无能量的盖楼，钻多的发红包，钻少的发贴子，有区块链...   \n",
       "102762       0                                                继续发   \n",
       "102763       0                              据说排名第一的钱包地址是天涯的，有几千万个   \n",
       "102764       0                                             哪个男的是谁   \n",
       "102765       0                    就像马云说的假货问题是整个社会的问题，不能把问题推给企业了事。   \n",
       "102766       0  借助区块链3.0技术，实物资产将大规模上链，映射为可确权的“数字票证”，这将带来数字经济的爆...   \n",
       "102767       0                                            打好防守攻坚战   \n",
       "102768       0                                             这个女人是谁   \n",
       "102769       0                                         顶起来，为正义喝彩！   \n",
       "102770       0             黑猫白猫抖一抖，公蜘母蛛满地走。\\n \\n 蛀虫多在体制内，尤其宣传教育口。   \n",
       "102771       0  公元：2018年12月6日11时13分25秒 阴8局\\n \\n 农历：2018年10月29日...   \n",
       "102772       0                                        你都没钻了，怎么点赞啊   \n",
       "102773       0                               但恩桐坚持，云告天只得加了一条附加条款。   \n",
       "102774       0                                给你点赞了，以能量换能量不占便宜不吃亏   \n",
       "102775       0                                        不管你说什么，就是不买   \n",
       "102776       0                                               强盗来了   \n",
       "102777       0  大家一起把涯钻的事业做起来。有能量的点赞，无能量的盖楼，钻多的发红包，钻少的发贴子，有区块链...   \n",
       "102778       0  @灿若星辰9 2018-09-12 08:45:28\\n \\n 全天在线，赞我秒回红包\\n ...   \n",
       "102779       0  区块链这个新型的技术是不会被杀死的，任何一个新的东西出来都会经历市场的历练，就跟当时的互联网...   \n",
       "102780       0           有饛簋飧,有捄棘匕。周道如砥,其直如矢。君子所履,小人所视。眷言顾之,潸焉出涕。   \n",
       "102781       0                                   所以女人的终极目标是要嫁人生子？   \n",
       "102782       0                   最近比较忙，上贴吧少，可以联系老师扣扣或威信2563454870   \n",
       "102783       0                                             鲁创已回红包   \n",
       "102784       0                               只有标题，本文没几个字！鉴定完毕水贴一张   \n",
       "102785       0        主要是人设是美男子，他颜值不够，演的又太过用力，本身角色就挺惹人讨厌，无法让人心生怜惜   \n",
       "102786       0                                    你的命都可以被管控，何况是房产   \n",
       "102787       0                                 明星演的看着假像看过了的，不精彩了。   \n",
       "102788       0                                                 顶贴   \n",
       "102789       0                                             谁能人肉一下   \n",
       "\n",
       "                                                   vector   ss  predict  \n",
       "100742  [-0.00050121656, 0.004646495, 0.00049711263, -... -1.0      1.0  \n",
       "100743  [-0.00021289136, -0.0002112096, 0.00020131774,... -1.0      0.0  \n",
       "100744  [-0.00041641563, 0.004448106, 0.0007133413, -0... -1.0      1.0  \n",
       "100745  [-0.0001965089, 0.0002559799, -0.00016521648, ... -1.0      1.0  \n",
       "100746  [-0.00012122595, -0.00019975816, -9.6026626e-0... -1.0      1.0  \n",
       "100747  [-0.00015112055, -0.00021713384, 4.3946588e-05... -1.0      0.0  \n",
       "100748  [0.00014590095, -2.36256e-05, -5.815563e-06, 7... -1.0      1.0  \n",
       "100749  [2.177861e-06, 0.00014500064, -2.8838269e-05, ... -1.0      1.0  \n",
       "100750  [-0.00053806993, 0.004573738, 0.00034990904, -... -1.0      1.0  \n",
       "100751  [5.1506522e-05, -5.8857413e-06, 1.2539855e-05,... -1.0      1.0  \n",
       "100752  [5.0086005e-06, -5.9196904e-05, 0.00014237005,... -1.0      1.0  \n",
       "100753  [0.00023834412, 0.00014466436, 9.70999e-06, -9... -1.0      0.0  \n",
       "100754  [-0.00022901272, 0.00014568881, 5.5341217e-05,... -1.0      1.0  \n",
       "100755  [-0.00020383063, 0.00023106053, -9.111098e-05,... -1.0      0.0  \n",
       "100756  [9.605871e-05, -0.00023950999, 5.6189332e-05, ... -1.0      1.0  \n",
       "100757  [-0.0009158363, 0.0066905254, 0.0005661698, -0... -1.0      1.0  \n",
       "100758  [-0.00015296177, -0.00022891088, -0.0001389691... -1.0      1.0  \n",
       "100759  [7.515958e-05, 9.5837815e-05, -0.00018056587, ... -1.0      1.0  \n",
       "100760  [0.00017363396, 0.00016908465, 0.00012319739, ... -1.0      0.0  \n",
       "100761  [-0.00016200649, 8.652916e-05, 6.718655e-05, 8... -1.0      1.0  \n",
       "100762  [-0.00049750454, 0.0019457284, 0.00022722762, ... -1.0      1.0  \n",
       "100763  [0.00020053485, -0.0002160742, 0.00017019118, ... -1.0      0.0  \n",
       "100764  [-0.00017903057, 0.0001622189, -0.00022155614,... -1.0      0.0  \n",
       "100765  [-0.00011312829, -0.00023002345, -0.0001825704... -1.0      1.0  \n",
       "100766  [-1.365614e-05, 2.8391225e-05, -0.00021587127,... -1.0      0.0  \n",
       "100767  [-0.0001810392, 0.00022770114, 0.000116205876,... -1.0      0.0  \n",
       "100768  [3.038715e-05, 3.3963806e-05, -1.3680326e-05, ... -1.0      0.0  \n",
       "100769  [3.211969e-05, -0.00015681396, 1.900563e-05, 0... -1.0      0.0  \n",
       "100770  [-0.00017184012, 1.6387998e-05, -3.682658e-05,... -1.0      1.0  \n",
       "100771  [5.7232e-05, -4.43334e-06, -0.00019098798, 3.0... -1.0      1.0  \n",
       "...                                                   ...  ...      ...  \n",
       "102760  [1.2418393e-05, -9.22599e-05, -0.00022082537, ... -1.0      0.0  \n",
       "102761  [0.00020369448, 0.00022521541, 0.00019723042, ... -1.0      0.0  \n",
       "102762  [9.0072135e-05, 8.114723e-05, 6.366512e-05, -8... -1.0      1.0  \n",
       "102763  [-0.0004768259, 0.0049025207, 0.0006336634, -0... -1.0      1.0  \n",
       "102764  [0.00011036501, -8.9452784e-05, 4.2348955e-05,... -1.0      1.0  \n",
       "102765  [-0.0008231468, 0.0048623886, 0.0005516676, -0... -1.0      1.0  \n",
       "102766  [-6.13557e-05, -0.00015381981, 0.0001829403, -... -1.0      1.0  \n",
       "102767  [-6.63767e-05, -0.0001055677, 0.00015874115, -... -1.0      1.0  \n",
       "102768  [-4.789428e-05, -0.00020672627, -0.00013019978... -1.0      0.0  \n",
       "102769  [-0.00084950624, 0.0048679304, 0.0006878436, -... -1.0      1.0  \n",
       "102770  [-4.11761e-05, -3.5230845e-05, 0.00012575409, ... -1.0      1.0  \n",
       "102771  [0.00012963584, 0.00012855428, -0.00014557735,... -1.0      0.0  \n",
       "102772  [-0.0001524354, 6.217486e-05, 0.00019323081, 0... -1.0      1.0  \n",
       "102773  [0.00020328503, 0.00039947787, 3.8845763e-05, ... -1.0      1.0  \n",
       "102774  [-0.00017894346, -0.0002214486, 0.00011141604,... -1.0      1.0  \n",
       "102775  [-0.0005581288, 0.004712583, 0.0004242073, -0.... -1.0      1.0  \n",
       "102776  [0.0001694109, -0.00023546406, -3.9594e-05, -0... -1.0      0.0  \n",
       "102777  [0.00020369448, 0.00022521541, 0.00019723042, ... -1.0      0.0  \n",
       "102778  [0.00017237266, -0.00012719107, 8.0563535e-05,... -1.0      1.0  \n",
       "102779  [-1.5643005e-05, -0.0001529364, -0.00017457732... -1.0      0.0  \n",
       "102780  [-0.00015580615, 0.00013681517, 0.00010879679,... -1.0      0.0  \n",
       "102781  [-0.00018765307, -3.1531177e-05, 0.00013083006... -1.0      1.0  \n",
       "102782  [1.0329338e-05, -5.8130994e-05, 4.8842303e-05,... -1.0      1.0  \n",
       "102783  [-8.7861634e-05, 0.00013935432, -2.9601068e-05... -1.0      1.0  \n",
       "102784  [-0.00045963874, 0.004632451, 0.0003776469, -0... -1.0      1.0  \n",
       "102785  [-0.00015876505, -3.1368076e-05, -0.0001053466... -1.0      0.0  \n",
       "102786  [-0.00041723697, 0.004586449, 0.00037609783, -... -1.0      1.0  \n",
       "102787  [-0.0004734579, 0.0047924956, 0.00065743795, -... -1.0      1.0  \n",
       "102788  [-0.00015331614, -1.1328045e-05, 0.00021822979... -1.0      0.0  \n",
       "102789  [-6.006425e-05, 7.563771e-06, -2.4452002e-05, ... -1.0      1.0  \n",
       "\n",
       "[2048 rows x 8 columns]"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Store y_predict in the 'predict' column of `unlabeled` (mutates the frame in place),\n",
    "# then show the frame as this cell's output.\n",
    "# NOTE(review): assumes y_predict lines up row-for-row with `unlabeled` -- confirm\n",
    "# against the cell that produced it. If `unlabeled` was sliced from another frame,\n",
    "# this assignment may trigger pandas' SettingWithCopyWarning -- verify.\n",
    "unlabeled['predict'] = y_predict\n",
    "unlabeled"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Take the first row of `unlabeled` and bind one variable per column\n",
    "# (order must match the frame's 8 columns) for ad-hoc inspection.\n",
    "first_row = unlabeled.iloc[0]\n",
    "rid, content_x, tag, assure, content_y, vector, ss, predict = first_row"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "59691"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the rid unpacked from the first row of `unlabeled`.\n",
    "rid"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "ds",
   "language": "python",
   "name": "ds"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
